Statistical inference
The theory, methods, and practice of forming judgments about the parameters of a population and the reliability of statistical relationships, typically on the basis of random sampling
credit: Google
Purpose of this section:
Why do all of this in one place?
# Libraries
#install.packages('tidyverse') # if you haven't installed already
#install.packages('labelled') # if you haven't installed already
#install.packages('patchwork') # if you haven't installed already
library(tidyverse) # load tidyverse package
library(labelled) # load labelled package
library(patchwork)
##########
########## IPEDS
##########
# Load the IPEDS panel from the course website
# (presumably this provides the object `panel_data` used below -- confirm)
load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/ipeds/output_data/panel_data.RData'))

# Build the IPEDS "population" data frame with fewer variables/observations
df_ipeds_pop <- panel_data %>%
  # keep fall 2019 data for research universities (2015 Carnegie
  # classification 15,16,17) and master's universities (18,19,20)
  filter(
    year == 2019,
    c15basic %in% c(15, 16, 17, 18, 19, 20)
  ) %>%
  # keep and rename variables in one step; syntax <new_name> = <old_name>
  select(
    # basic institutional characteristics
    instnm, unitid, opeid6, opeid, control, c15basic, stabbr, city, zip, locale,
    region = obereg, # region
    # avg tuition and fees for full-time grad, in-state and out-of-state
    tuit_grad_res = tuition6, fee_grad_res = fee6,
    tuit_grad_nres = tuition7, fee_grad_nres = fee7,
    # avg tuition and fees for MD, in-state and out-of-state
    tuit_md_res = isprof3, fee_md_res = ispfee3,
    tuit_md_nres = osprof3, fee_md_nres = ospfee3,
    # avg tuition and fees for Law, in-state and out-of-state
    tuit_law_res = isprof9, fee_law_res = ispfee9,
    tuit_law_nres = osprof9, fee_law_nres = ospfee9,
    # [undergraduate] books+supplies; off-campus (not with family) room and
    # board; off-campus (not with family) other expenses
    books_supplies = chg4ay3, roomboard_off = chg7ay3, oth_expense_off = chg8ay3
  ) %>%
  mutate(
    # tuition + fees
    tuitfee_grad_res = tuit_grad_res + fee_grad_res,    # graduate, state resident
    tuitfee_grad_nres = tuit_grad_nres + fee_grad_nres, # graduate, non-resident
    tuitfee_md_res = tuit_md_res + fee_md_res,          # MD, state resident
    tuitfee_md_nres = tuit_md_nres + fee_md_nres,       # MD, non-resident
    tuitfee_law_res = tuit_law_res + fee_law_res,       # Law, state resident
    tuitfee_law_nres = tuit_law_nres + fee_law_nres,    # Law, non-resident
    # cost-of-attendance (COA) = tuition + fees + books + living expenses
    coa_grad_res = tuitfee_grad_res + books_supplies + roomboard_off + oth_expense_off,
    coa_grad_nres = tuitfee_grad_nres + books_supplies + roomboard_off + oth_expense_off,
    coa_md_res = tuitfee_md_res + books_supplies + roomboard_off + oth_expense_off,
    coa_md_nres = tuitfee_md_nres + books_supplies + roomboard_off + oth_expense_off,
    coa_law_res = tuitfee_law_res + books_supplies + roomboard_off + oth_expense_off,
    coa_law_nres = tuitfee_law_nres + books_supplies + roomboard_off + oth_expense_off
  ) %>%
  # keep only observations with a non-missing coa_grad_res; this loses some
  # interesting universities but removes needless complications when
  # learning core concepts about statistical inference
  filter(!is.na(coa_grad_res))

# Add variable labels to the tuition+fees variables and the COA variables
var_label(df_ipeds_pop) <- list(
  # tuition + fees variables
  tuitfee_grad_res = 'graduate, full-time, resident; avg tuition + required fees',
  tuitfee_grad_nres = 'graduate, full-time, non-resident; avg tuition + required fees',
  tuitfee_md_res = 'MD, full-time, state resident; avg tuition + required fees',
  tuitfee_md_nres = 'MD, full-time, non-resident; avg tuition + required fees',
  tuitfee_law_res = 'Law, full-time, state resident; avg tuition + required fees',
  tuitfee_law_nres = 'Law, full-time, non-resident; avg tuition + required fees',
  # COA variables
  coa_grad_res = 'graduate, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses',
  coa_grad_nres = 'graduate, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses',
  coa_md_res = 'MD, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses',
  coa_md_nres = 'MD, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses',
  coa_law_res = 'Law, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses',
  coa_law_nres = 'Law, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
)

# Print the first rows
head(df_ipeds_pop)
#> # A tibble: 6 x 38
#> instnm unitid opeid6 opeid control c15basic stabbr city zip locale region tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres tuit_md_res fee_md_res tuit_md_nres fee_md_nres tuit_law_res fee_law_res tuit_law_nres fee_law_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres tuitfee_md_res tuitfee_md_nres tuitfee_law_res tuitfee_law_nres coa_grad_res coa_grad_nres coa_md_res coa_md_nres coa_law_res coa_law_nres
#> <chr> <dbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <chr+lbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Alabama A & M University 100654 001002 00100200 1 [Public] 18 [Master^s Colleges & Universities: Larger Programs] AL [Alabama] Normal 35762 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10128 1414 20160 1414 NA NA NA NA NA NA NA NA 1600 9240 3090 11542 21574 NA NA NA NA 25472 35504 NA NA NA NA
#> 2 University of Alabama at Birmingham 100663 001052 00105200 1 [Public] 15 [Doctoral Universities: Highest Research Activity] AL [Alabama] Birmingham 35294-0110 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 8100 0 19188 0 28978 0 62714 0 NA NA NA NA 1200 12307 5555 8100 19188 28978 62714 NA NA 27162 38250 48040 81776 NA NA
#> 3 Amridge University 100690 025034 02503400 2 [Private not-for-profit] 20 [Master^s Colleges & Universities: Small Programs] AL [Alabama] Montgomery 36117-3553 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 11700 1300 11700 1300 NA NA NA NA NA NA NA NA 900 9600 1600 13000 13000 NA NA NA NA 25100 25100 NA NA NA NA
#> 4 University of Alabama in Huntsville 100706 001055 00105500 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Huntsville 35899 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10632 826 24430 826 NA NA NA NA NA NA NA NA 2120 10400 3994 11458 25256 NA NA NA NA 27972 41770 NA NA NA NA
#> 5 Alabama State University 100724 001005 00100500 1 [Public] 19 [Master^s Colleges & Universities: Medium Programs] AL [Alabama] Montgomery 36104-0271 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 7416 2740 14832 2740 NA NA NA NA NA NA NA NA 1600 7320 4228 10156 17572 NA NA NA NA 23304 30720 NA NA NA NA
#> 6 The University of Alabama 100751 001051 00105100 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Tuscaloosa 35487-0100 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10780 0 30250 0 28978 0 62714 0 23610 0 43060 0 1000 13636 4600 10780 30250 28978 62714 23610 43060 30016 49486 48214 81950 42846 62296
##########
########## Create data frame of generated variables, with each variable meant to represent the entire population
##########
# Number of observations in each generated "population" variable
num_obs <- 10000

# NOTE(review): the seed is reset to the same value before every draw, so the
# generated variables are not independent of one another (e.g. the two normal
# variables are driven by the same underlying draws). Presumably intentional
# for reproducibility -- confirm.

# Generate normal distribution w/ custom mean and sd
set.seed(124)
norm_dist <- rnorm(n = num_obs, mean = 50, sd = 5)

# Generate right-skewed distribution
set.seed(124)
rskew_dist <- rbeta(n = num_obs, shape1 = 2, shape2 = 5)

# Generate left-skewed distribution
set.seed(124)
lskew_dist <- rbeta(n = num_obs, shape1 = 5, shape2 = 2)

# Generate standard normal distribution (default is mean = 0 and sd = 1)
set.seed(124)
stdnorm_dist <- rnorm(n = num_obs, mean = 0, sd = 1) # equivalent to rnorm(num_obs)

# Create dataframe (as_tibble() replaces the deprecated as.tibble())
df_generated_pop <- data.frame(norm_dist, rskew_dist, lskew_dist, stdnorm_dist) %>%
  as_tibble()

# drop individual objects associated with each variable
rm(norm_dist, rskew_dist, lskew_dist, stdnorm_dist)
rm(num_obs)
##########
########## Create sample versions of generated population data frame and IPEDS population data frame
##########
# Draw a random sample of 200 rows from the generated population data
set.seed(124) # fixed seed so that everyone ends up with the same random sample
df_generated_sample <- sample_n(df_generated_pop, size = 200)
head(df_generated_sample)
#> # A tibble: 6 x 4
#> norm_dist rskew_dist lskew_dist stdnorm_dist
#> <dbl> <dbl> <dbl> <dbl>
#> 1 42.7 0.345 0.655 -1.46
#> 2 50.2 0.314 0.686 0.0488
#> 3 61.3 0.0938 0.906 2.26
#> 4 45.5 0.0558 0.944 -0.905
#> 5 44.7 0.0745 0.926 -1.05
#> 6 48.0 0.301 0.699 -0.402

# Draw a random sample of 200 rows from the IPEDS population data
set.seed(124) # fixed seed so that everyone ends up with the same random sample
df_ipeds_sample <- sample_n(df_ipeds_pop, size = 200)

# Population mean vs. sample mean of coa_grad_res
mean(df_ipeds_pop[['coa_grad_res']], na.rm = TRUE)
#> [1] 30318.74
mean(df_ipeds_sample[['coa_grad_res']], na.rm = TRUE)
#> [1] 30002.74
##########
# Write function to get the sampling distribution from a variable (defaults equal 1000 samples of size 200)
##########
#' Simulate the sampling distribution of the sample mean
#'
#' Repeatedly draws a random sample (without replacement) from `data_vec`
#' and records the mean of each sample.
#'
#' @param data_vec Vector of values treated as the population.
#' @param num_samples Number of samples to draw (default 1000).
#' @param sample_size Number of observations per sample (default 200).
#' @return Numeric vector of length `num_samples` holding the sample means.
get_sampling_distribution <- function(data_vec, num_samples = 1000, sample_size = 200) {
  sample_means <- vector(mode = 'numeric', num_samples)
  # seq_along() instead of 1:length(): with num_samples = 0 the latter would
  # iterate over c(1, 0) and fail instead of returning an empty vector
  for (i in seq_along(sample_means)) {
    samp <- sample(data_vec, sample_size)
    sample_means[[i]] <- mean(samp, na.rm = TRUE)
  }
  sample_means
}
##########
# Create function to generate plots of variable distributions
##########
#' Plot the distribution of a variable (or of its sampling distribution)
#'
#' Draws a density curve (plus a histogram in some modes) for `data_var` and
#' marks the mean and median with vertical lines. Behaviour by mode:
#' * no `group_var`: one distribution; with `sampling_dist = TRUE` the
#'   sampling distribution of the mean (from `get_sampling_distribution()`
#'   with its default settings) is plotted instead of the raw values.
#' * `group_var` and `sampling_dist = FALSE`: the two group distributions are
#'   overlaid.
#' * `group_var` and `sampling_dist = TRUE`: the difference between the two
#'   groups' simulated sample means is plotted.
#'
#' @param data_df Data frame holding `data_var` (and `group_var` if given).
#' @param data_var Name (string) of the variable to plot.
#' @param group_var Optional name (string) of the grouping variable.
#' @param group_cat Optional pair of `group_var` values to compare; when
#'   `NULL`, the two smallest distinct non-missing values are used.
#' @param show_group_hist Also draw histograms when two distributions are
#'   overlaid?
#' @param sampling_dist Plot sampling distribution(s) instead of raw values?
#' @param plot_title Plot title.
#' @return A ggplot object.
plot_distribution <- function(data_df, data_var, group_var = NULL, group_cat = NULL, show_group_hist = FALSE, sampling_dist = FALSE, plot_title = '') {
  # Prep dataframe
  two_pop <- !is.null(group_var)         # a grouping variable was supplied
  two_dist <- two_pop && !sampling_dist  # two separate curves will be drawn
  n_dist <- if (two_dist) 2 else 1       # plain if/else: condition is a scalar

  data_df[[data_var]] <- unclass(data_df[[data_var]])  # unclass haven_labelled
  if (two_pop) {
    data_df[[group_var]] <- unclass(data_df[[group_var]])
  }

  # remove NA rows; .data[[ ]] looks the string name up among the columns only
  data_df <- data_df %>% filter(!is.na(.data[[data_var]]))

  # If group_cat not provided, use 2 values from group_var
  if (two_pop && is.null(group_cat)) {
    group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
  }

  # Create population vector(s)
  if (!two_pop) {  # single population
    data_vec1 <- data_df[[data_var]]

    if (sampling_dist) {  # sampling distribution
      data_vec1 <- get_sampling_distribution(data_vec1)
    }
  } else {  # two populations
    data_vec1 <- (data_df %>% filter(.data[[group_var]] == group_cat[[1]]))[[data_var]]
    data_vec2 <- (data_df %>% filter(.data[[group_var]] == group_cat[[2]]))[[data_var]]

    if (sampling_dist) {  # sampling distribution of the difference in means
      data_vec1_samp <- get_sampling_distribution(data_vec1)
      data_vec2_samp <- get_sampling_distribution(data_vec2)
      data_vec1 <- data_vec1_samp - data_vec2_samp
    }
  }

  # Create statistics dataframe (one mean line and one median line per curve)
  if (!two_dist) {
    lines_vec <- c('dotted')
    stats_vec <- c(mean(data_vec1), median(data_vec1))
    legend_title <- 'Statistics'

    if (two_pop && sampling_dist) {
      legend_title <- paste0('Statistics\n(', group_var, '=', group_cat[[1]], ' - ', group_var, '=', group_cat[[2]], ')')
    }
  } else {
    lines_vec <- c('dotted', 'dotdash')
    stats_vec <- c(mean(data_vec1), median(data_vec1), mean(data_vec2), median(data_vec2))
    legend_title <- paste0('Statistics\n(', group_var, '=', group_cat[[1]], ' vs. ', group_var, '=', group_cat[[2]], ')')
  }

  stats_df <- data.frame(
    pop = rep(lines_vec, each = 2),                 # line type identifies the curve
    stat = rep(c('blue', 'red'), times = n_dist),   # colour: blue = mean, red = median
    val = stats_vec
  )
  stats_df$pop <- factor(stats_df$pop, levels = c('dotted', 'dotdash'))

  # Legend text
  legend_text <- c(paste('Mean:', round(mean(data_vec1), 2),
                         '\nStd Dev:', round(sd(data_vec1), 2)),
                   paste('Median:', round(median(data_vec1), 2)))

  if (two_dist) {
    legend_text <- c(legend_text,
                     paste('Mean:', round(mean(data_vec2), 2),
                           '\nStd Dev:', round(sd(data_vec2), 2)),
                     paste('Median:', round(median(data_vec2), 2)))
  }

  # Plot distribution(s)
  # NOTE(review): `..density..` and `size` for line widths are deprecated as of
  # ggplot2 3.4.0 (after_stat(density) / linewidth); kept here so the function
  # keeps working with the ggplot2 version the course materials were built on.
  p <- ggplot() +
    ggtitle(plot_title) + xlab('') + ylab('') +
    geom_density(aes(x = data_vec1), alpha = 0.8)

  if (!two_dist || show_group_hist) {  # show histogram only if 1 pop or show_group_hist is TRUE
    p <- p +
      geom_histogram(aes(x = data_vec1, y = ..density..), alpha = 0.4, position = 'identity')
  }

  if (two_dist) {
    p <- p +
      geom_density(aes(x = data_vec2), alpha = 0.8)

    if (show_group_hist) {  # show histogram only if show_group_hist is TRUE
      p <- p +
        geom_histogram(aes(x = data_vec2, y = ..density..), alpha = 0.4, position = 'identity', fill = 'wheat4')
    }
  }

  # Mean/median lines; colour and linetype share one legend because both
  # scales use the same name and labels
  p <- p +
    geom_vline(data = stats_df,
               aes(xintercept = val, color = interaction(stat, pop), linetype = interaction(stat, pop)),
               size = 0.6, alpha = 0.8) +
    scale_color_manual(name = legend_title,
                       labels = legend_text,
                       values = as.character(stats_df$stat)) +
    scale_linetype_manual(name = legend_title,
                          labels = legend_text,
                          values = as.character(stats_df$pop)) +
    theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
          legend.title = element_text(size = 9, face = 'bold'),
          legend.text = element_text(size = 8)) +
    guides(col = guide_legend(ncol = n_dist))

  p
}
##########
# Write Function to generate sampling distribution (with t-test value) assuming null hypothesis is correct
##########
# Function to generate t-distribution plot
#' Plot the t-distribution for a one- or two-sample t-test
#'
#' Computes the t-statistic, standard error, critical value(s) and p-value,
#' then draws the t-distribution under the null hypothesis with the
#' t-statistic and critical value(s) marked. The secondary x-axis shows the
#' same positions on the scale of the data (`t * SE + mu`).
#'
#' @param data_df Data frame holding `data_var` (and `group_var` if given).
#' @param data_var Name (string) of the variable being tested.
#' @param group_var Optional name (string) of the grouping variable; when
#'   given, a two-sample test with unequal variances (Welch degrees of
#'   freedom) is computed.
#' @param group_cat Optional pair of `group_var` values to compare; when
#'   `NULL`, the two smallest distinct non-missing values are used.
#' @param mu Value of the mean (one sample) or of the difference in means
#'   (two samples) under the null hypothesis.
#' @param alpha Significance level.
#' @param alternative 'less', 'greater', or anything else for two-sided;
#'   the two-sided p-value shading requires exactly 'two.sided'.
#' @param plot_title Plot title.
#' @param shade_rejection Shade the rejection region(s) in red?
#' @param shade_pval Shade the p-value area(s) in blue?
#' @param stacked If TRUE, statistics go into the legend; otherwise they are
#'   written as text to the right of the plot.
#' @return A ggplot object.
plot_t_distribution <- function(data_df, data_var, group_var = NULL, group_cat = NULL, mu = 0, alpha = 0.05, alternative = 'two.sided', plot_title = '', shade_rejection = TRUE, shade_pval = TRUE, stacked = FALSE) {
  # Fixed-notation formatting for p-values: as.character()/str_c() render
  # values such as 0.0001 as "1e-04"
  fmt_p <- function(p) format(p, scientific = FALSE)

  # Prep dataframe
  two_pop <- !is.null(group_var)
  data_df[[data_var]] <- unclass(data_df[[data_var]])  # unclass haven_labelled
  if (two_pop) {
    data_df[[group_var]] <- unclass(data_df[[group_var]])
  }

  # remove NA rows; .data[[ ]] looks the string name up among the columns only
  data_df <- data_df %>% filter(!is.na(.data[[data_var]]))

  # If group_cat not provided, use 2 values from group_var
  if (two_pop && is.null(group_cat)) {
    group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
  }

  # Calculate stats
  if (!two_pop) {  # single sample
    data_vec <- data_df[[data_var]]

    # Calculate t-statistics
    sample_size <- length(data_vec)
    deg_freedom <- sample_size - 1
    xbar <- mean(data_vec)
    s <- sd(data_vec)
    std_err <- s / sqrt(sample_size)
    t <- (xbar - mu) / std_err
  } else {  # two samples
    data_vec1 <- (data_df %>% filter(.data[[group_var]] == group_cat[[1]]))[[data_var]]
    data_vec2 <- (data_df %>% filter(.data[[group_var]] == group_cat[[2]]))[[data_var]]

    # Calculate t-statistics
    xbar1 <- mean(data_vec1)
    xbar2 <- mean(data_vec2)
    s1 <- sd(data_vec1)
    s2 <- sd(data_vec2)
    n1 <- length(data_vec1)
    n2 <- length(data_vec2)
    # Welch-Satterthwaite degrees of freedom (unequal variances)
    deg_freedom <- (s1**2/n1 + s2**2/n2)**2 / ((s1**2/n1)**2/(n1-1) + (s2**2/n2)**2/(n2-1))
    std_err <- sqrt(s1**2/n1 + s2**2/n2)
    # subtract the hypothesised difference so that t agrees with the
    # secondary axis (t * std_err + mu); identical to before when mu = 0
    t <- (xbar1 - xbar2 - mu) / std_err
  }

  # Calculate critical value and p-value
  # pval       : text shown in the statistics list
  # pval_final : the final p-value on its own (used in the stacked legend)
  if (alternative == 'less') {  # left-tailed
    cv_lower <- qt(p = alpha, df = deg_freedom, lower.tail = TRUE)
    cv_legend <- round(cv_lower, 2)
    pval_final <- fmt_p(round(pt(q = t, df = deg_freedom, lower.tail = TRUE), 4))
    pval <- pval_final
  } else if (alternative == 'greater') {  # right-tailed
    cv_upper <- qt(p = alpha, df = deg_freedom, lower.tail = FALSE)
    cv_legend <- round(cv_upper, 2)
    pval_final <- fmt_p(round(pt(q = t, df = deg_freedom, lower.tail = FALSE), 4))
    pval <- pval_final
  } else {  # two-tailed
    cv_lower <- qt(p = alpha / 2, df = deg_freedom, lower.tail = TRUE)
    cv_upper <- qt(p = alpha / 2, df = deg_freedom, lower.tail = FALSE)
    cv_legend <- str_c('\u00B1', round(cv_upper, 2))
    # area in the tail on the same side as t, shown doubled
    pval_half <- round(pt(q = t, df = deg_freedom, lower.tail = t < 0), 4)
    pval_final <- fmt_p(2 * pval_half)
    pval <- str_c(fmt_p(pval_half), ' + ', fmt_p(pval_half), ' = ', pval_final)
  }

  # Plot t-distribution
  p <- ggplot(data.frame(x = c(-4, 4)), aes(x)) +
    ggtitle(plot_title) + xlab('') + ylab('') +
    stat_function(fun = dt, args = list(df = deg_freedom), xlim = c(-4, 4))

  # Shade rejection region using critical value
  if (alternative != 'greater') {  # lower tail
    p <- p + geom_vline(aes(xintercept = cv_lower, color = 'cval'),
                        linetype = 'dotted', size = 0.8, alpha = 0.8)

    if (shade_rejection) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(-4, cv_lower),
                             geom = 'area', alpha = 0.3, fill = 'red')
    }

    if (shade_pval) {
      pval_upper_bound <- if (alternative == 'two.sided') -abs(t) else t
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(-4, pval_upper_bound),
                             geom = 'area', alpha = 0.3, fill = 'blue')
    }
  }

  if (alternative != 'less') {  # upper tail
    p <- p + geom_vline(aes(xintercept = cv_upper, color = 'cval'),
                        linetype = 'dotted', size = 0.8, alpha = 0.8)

    if (shade_rejection) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(cv_upper, 4),
                             geom = 'area', alpha = 0.3, fill = 'red')
    }

    if (shade_pval) {
      pval_lower_bound <- if (alternative == 'two.sided') abs(t) else t
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(pval_lower_bound, 4),
                             geom = 'area', alpha = 0.3, fill = 'blue')
    }
  }

  # Legend text
  legend_text <- c('t-statistics / p-value', 'critical value / alpha')

  if (stacked) {
    legend_text <- c(str_c('t-statistics: ', round(t, 2),
                           '\n(p-value: ', pval_final, ')'),
                     str_c('Critical value: ', cv_legend,
                           '\n(alpha: ', round(alpha, 2), ')'))
  }

  stats_text <- c(str_c('t-statistics: ', round(t, 2)),
                  str_c('SE: ', round(std_err, 2)),
                  str_c('p-value: ', pval),
                  str_c('Critical value: ', cv_legend),
                  str_c('alpha: ', round(alpha, 2)))

  if (!stacked) {
    # statistics are written outside the panel (x > 4); this relies on
    # clip = 'off' and the wider right margin set below
    p <- p +
      annotate('text', size = 9*5/14, x = 4.84, y = 0.14, hjust = 0,
               label = 'bold(Statistics)', parse = TRUE) +
      annotate('text', size = 8*5/14, x = 4.89, y = 0:4 * -0.015 + 0.12, hjust = 0,
               label = stats_text)
  }

  # Label plot
  p <- p +
    geom_vline(aes(xintercept = t, color = 'tstat'),
               linetype = 'dotted', size = 0.8, alpha = 0.8) +
    # secondary axis: t-scale converted back to the scale of the data
    scale_x_continuous(sec.axis = sec_axis(trans = ~ . * std_err + mu)) +
    scale_color_manual(name = if (stacked) 'Statistics' else 'Legend',
                       breaks = c('tstat', 'cval'),
                       labels = legend_text,
                       values = c(tstat = 'blue', cval = 'red')) +
    theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
          plot.margin = unit(c(5.5, if (stacked) 5.5 else 30, 5.5, 5.5), 'pt'),
          legend.title = element_text(size = 9, face = 'bold'),
          legend.text = element_text(size = 8)) +
    coord_cartesian(xlim = c(-4, 4),
                    clip = 'off')

  p
}
What is IPEDS?
Which IPEDS variables will we be using to teach statistical inference?
Why use IPEDS data rather than College Scorecard data? and why these variables?
Some definitions related to tuition, fees, expenses, etc; from the IPEDS “Student Charges for Full Academic Year” 2019-20 academic year data dictionary [LINK]:
Note: the IPEDS measures of full-time graduate tuition (both in-state and out-of-state) are “average” tuition price across different graduate degree programs (excluding “first-professional” degree programs like law and medicine)
Note: “required fees” do not include the cost of healthcare (I think)
df_ipeds_pop
Show variable labels
# Print the variable label of every column
var_label(df_ipeds_pop)
#> $instnm
#> [1] "Institution (entity) name"
#>
#> $unitid
#> [1] "Unique identification number of the institution"
#>
#> $opeid6
#> [1] "First 6 digits of OPEID"
#>
#> $opeid
#> [1] "Office of Postsecondary Education (OPE) ID Number"
#>
#> $control
#> [1] "Control of institution"
#>
#> $c15basic
#> [1] "Carnegie Classification 2015: Basic"
#>
#> $stabbr
#> [1] "State abbreviation"
#>
#> $city
#> [1] "City location of institution"
#>
#> $zip
#> [1] "ZIP code"
#>
#> $locale
#> [1] "Degree of urbanization (Urban-centric locale)"
#>
#> $region
#> [1] "Bureau of Economic Analysis (BEA) regions"
#>
#> $tuit_grad_res
#> [1] "In-state average tuition full-time graduates"
#>
#> $fee_grad_res
#> [1] "In-state required fees for full-time graduates"
#>
#> $tuit_grad_nres
#> [1] "Out-of-state average tuition full-time graduates"
#>
#> $fee_grad_nres
#> [1] "Out-of-state required fees for full-time graduates"
#>
#> $tuit_md_res
#> [1] "Medicine: In-state tuition"
#>
#> $fee_md_res
#> [1] "Medicine: In-state required fees"
#>
#> $tuit_md_nres
#> [1] "Medicine: Out-of-state tuition"
#>
#> $fee_md_nres
#> [1] "Medicine: Out-of-state required fees"
#>
#> $tuit_law_res
#> [1] "Law: In-state tuition"
#>
#> $fee_law_res
#> [1] "Law: In-state required fees"
#>
#> $tuit_law_nres
#> [1] "Law: Out-of-state tuition"
#>
#> $fee_law_nres
#> [1] "Law: Out-of-state required fees"
#>
#> $books_supplies
#> [1] "Books and supplies 2019-20"
#>
#> $roomboard_off
#> [1] "Off campus (not with family), room and board 2019-20"
#>
#> $oth_expense_off
#> [1] "Off campus (not with family), other expenses 2019-20"
#>
#> $tuitfee_grad_res
#> [1] "graduate, full-time, resident; avg tuition + required fees"
#>
#> $tuitfee_grad_nres
#> [1] "graduate, full-time, non-resident; avg tuition + required fees"
#>
#> $tuitfee_md_res
#> [1] "MD, full-time, state resident; avg tuition + required fees"
#>
#> $tuitfee_md_nres
#> [1] "MD, full-time, non-resident; avg tuition + required fees"
#>
#> $tuitfee_law_res
#> [1] "Law, full-time, state resident; avg tuition + required fees"
#>
#> $tuitfee_law_nres
#> [1] "Law, full-time, non-resident; avg tuition + required fees"
#>
#> $coa_grad_res
#> [1] "graduate, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses"
#>
#> $coa_grad_nres
#> [1] "graduate, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses"
#>
#> $coa_md_res
#> [1] "MD, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses"
#>
#> $coa_md_nres
#> [1] "MD, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses"
#>
#> $coa_law_res
#> [1] "Law, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses"
#>
#> $coa_law_nres
#> [1] "Law, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses"
Show value labels for variables that are labelled class (code not run)
df_ipeds_pop %>% select(control,locale,region,c15basic) %>% val_labels()
Investigate data structure
# Does unitid uniquely identify rows? Count rows per unitid, then tabulate
# how many unitids have each row count
df_ipeds_pop %>%
  count(unitid, name = 'n_per_key') %>%
  count(n_per_key)
#> # A tibble: 1 x 2
#> n_per_key n
#> <int> <int>
#> 1 1 991
Graduate, state residents
df_ipeds_pop %>%
  # graduate, state-resident charges and cost of attendance
  select(instnm, unitid, tuit_grad_res, fee_grad_res, tuitfee_grad_res,
         books_supplies, roomboard_off, oth_expense_off, coa_grad_res) %>%
  # keep UC campuses (each unitid listed once; repeats do not affect %in%)
  filter(unitid %in% c(110398, 110635, 110644, 110653, 110662, 110671,
                       110680, 110699, 110705, 110714, 445188))
#> # A tibble: 9 x 9
#> instnm unitid tuit_grad_res fee_grad_res tuitfee_grad_res books_supplies roomboard_off oth_expense_off coa_grad_res
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of California-Berkeley 110635 11442 2745 14187 870 14771 5359 35187
#> 2 University of California-Davis 110644 11442 2156 13598 1158 10588 4856 30200
#> 3 University of California-Irvine 110653 11442 1907 13349 1390 12861 5184 32784
#> 4 University of California-Los Angeles 110662 11442 1511 12953 1463 14303 5126 33845
#> 5 University of California-Riverside 110671 11442 2142 13584 1436 10986 4792 30798
#> 6 University of California-San Diego 110680 11442 2013 13455 1128 13681 4760 33024
#> 7 University of California-Santa Barbara 110705 11442 2112 13554 1184 12818 6045 33601
#> 8 University of California-Santa Cruz 110714 11442 2366 13808 1085 13216 5442 33551
#> 9 University of California-Merced 445188 11442 1765 13207 1016 8595 4909 27727
Graduate, non-resident
df_ipeds_pop %>%
  # graduate, non-resident charges and cost of attendance
  select(instnm, unitid, tuit_grad_nres, fee_grad_nres, tuitfee_grad_nres,
         books_supplies, roomboard_off, oth_expense_off, coa_grad_nres) %>%
  # keep UC campuses (each unitid listed once; repeats do not affect %in%)
  filter(unitid %in% c(110398, 110635, 110644, 110653, 110662, 110671,
                       110680, 110699, 110705, 110714, 445188))
#> # A tibble: 9 x 9
#> instnm unitid tuit_grad_nres fee_grad_nres tuitfee_grad_nres books_supplies roomboard_off oth_expense_off coa_grad_nres
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of California-Berkeley 110635 26544 2745 29289 870 14771 5359 50289
#> 2 University of California-Davis 110644 26544 2156 28700 1158 10588 4856 45302
#> 3 University of California-Irvine 110653 26544 1907 28451 1390 12861 5184 47886
#> 4 University of California-Los Angeles 110662 26544 1511 28055 1463 14303 5126 48947
#> 5 University of California-Riverside 110671 26544 2142 28686 1436 10986 4792 45900
#> 6 University of California-San Diego 110680 26544 2013 28557 1128 13681 4760 48126
#> 7 University of California-Santa Barbara 110705 26544 2112 28656 1184 12818 6045 48703
#> 8 University of California-Santa Cruz 110714 26544 2366 28910 1085 13216 5442 48653
#> 9 University of California-Merced 445188 26544 1765 28309 1016 8595 4909 42829
MD, state resident
df_ipeds_pop %>%
  # MD, state-resident charges and cost of attendance
  select(instnm, unitid, tuit_md_res, fee_md_res, tuitfee_md_res,
         books_supplies, roomboard_off, oth_expense_off, coa_md_res) %>%
  # keep UC campuses (each unitid listed once; repeats do not affect %in%)
  filter(unitid %in% c(110398, 110635, 110644, 110653, 110662, 110671,
                       110680, 110699, 110705, 110714, 445188))
#> # A tibble: 9 x 9
#> instnm unitid tuit_md_res fee_md_res tuitfee_md_res books_supplies roomboard_off oth_expense_off coa_md_res
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of California-Berkeley 110635 34521 2745 37266 870 14771 5359 58266
#> 2 University of California-Davis 110644 34526 2156 36682 1158 10588 4856 53284
#> 3 University of California-Irvine 110653 34527 1907 36434 1390 12861 5184 55869
#> 4 University of California-Los Angeles 110662 36374 1511 37885 1463 14303 5126 58777
#> 5 University of California-Riverside 110671 35430 2142 37572 1436 10986 4792 54786
#> 6 University of California-San Diego 110680 34977 2013 36990 1128 13681 4760 56559
#> 7 University of California-Santa Barbara 110705 NA NA NA 1184 12818 6045 NA
#> 8 University of California-Santa Cruz 110714 NA NA NA 1085 13216 5442 NA
#> 9 University of California-Merced 445188 NA NA NA 1016 8595 4909 NA
MD, non-resident
df_ipeds_pop %>%
  # MD, non-resident charges and cost of attendance
  select(instnm, unitid, tuit_md_nres, fee_md_nres, tuitfee_md_nres,
         books_supplies, roomboard_off, oth_expense_off, coa_md_nres) %>%
  # keep UC campuses (each unitid listed once; repeats do not affect %in%)
  filter(unitid %in% c(110398, 110635, 110644, 110653, 110662, 110671,
                       110680, 110699, 110705, 110714, 445188))
#> # A tibble: 9 x 9
#> instnm unitid tuit_md_nres fee_md_nres tuitfee_md_nres books_supplies roomboard_off oth_expense_off coa_md_nres
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of California-Berkeley 110635 46766 2745 49511 870 14771 5359 70511
#> 2 University of California-Davis 110644 46771 2156 48927 1158 10588 4856 65529
#> 3 University of California-Irvine 110653 46772 1907 48679 1390 12861 5184 68114
#> 4 University of California-Los Angeles 110662 48619 1511 50130 1463 14303 5126 71022
#> 5 University of California-Riverside 110671 47675 2142 49817 1436 10986 4792 67031
#> 6 University of California-San Diego 110680 47222 2013 49235 1128 13681 4760 68804
#> 7 University of California-Santa Barbara 110705 NA NA NA 1184 12818 6045 NA
#> 8 University of California-Santa Cruz 110714 NA NA NA 1085 13216 5442 NA
#> 9 University of California-Merced 445188 NA NA NA 1016 8595 4909 NA
Law, state resident
df_ipeds_pop %>%
  # Law, state-resident charges and cost of attendance
  select(instnm, unitid, tuit_law_res, fee_law_res, tuitfee_law_res,
         books_supplies, roomboard_off, oth_expense_off, coa_law_res) %>%
  # keep UC campuses (each unitid listed once; repeats do not affect %in%)
  filter(unitid %in% c(110398, 110635, 110644, 110653, 110662, 110671,
                       110680, 110699, 110705, 110714, 445188))
#> # A tibble: 9 x 9
#> instnm unitid tuit_law_res fee_law_res tuitfee_law_res books_supplies roomboard_off oth_expense_off coa_law_res
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of California-Berkeley 110635 49242 2745 51987 870 14771 5359 72987
#> 2 University of California-Davis 110644 47332 2156 49488 1158 10588 4856 66090
#> 3 University of California-Irvine 110653 45578 1907 47485 1390 12861 5184 66920
#> 4 University of California-Los Angeles 110662 45578 1511 47089 1463 14303 5126 67981
#> 5 University of California-Riverside 110671 NA NA NA 1436 10986 4792 NA
#> 6 University of California-San Diego 110680 NA NA NA 1128 13681 4760 NA
#> 7 University of California-Santa Barbara 110705 NA NA NA 1184 12818 6045 NA
#> 8 University of California-Santa Cruz 110714 NA NA NA 1085 13216 5442 NA
#> 9 University of California-Merced 445188 NA NA NA 1016 8595 4909 NA
Law, non-resident
df_ipeds_pop %>%
  # Law, non-resident charges and cost of attendance
  select(instnm, unitid, tuit_law_nres, fee_law_nres, tuitfee_law_nres,
         books_supplies, roomboard_off, oth_expense_off, coa_law_nres) %>%
  # keep UC campuses (each unitid listed once; repeats do not affect %in%)
  filter(unitid %in% c(110398, 110635, 110644, 110653, 110662, 110671,
                       110680, 110699, 110705, 110714, 445188))
#> # A tibble: 9 x 9
#> instnm unitid tuit_law_nres fee_law_nres tuitfee_law_nres books_supplies roomboard_off oth_expense_off coa_law_nres
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of California-Berkeley 110635 52571 2745 55316 870 14771 5359 76316
#> 2 University of California-Davis 110644 56435 2156 58591 1158 10588 4856 75193
#> 3 University of California-Irvine 110653 51641 1907 53548 1390 12861 5184 72983
#> 4 University of California-Los Angeles 110662 51641 1511 53152 1463 14303 5126 74044
#> 5 University of California-Riverside 110671 NA NA NA 1436 10986 4792 NA
#> 6 University of California-San Diego 110680 NA NA NA 1128 13681 4760 NA
#> 7 University of California-Santa Barbara 110705 NA NA NA 1184 12818 6045 NA
#> 8 University of California-Santa Cruz 110714 NA NA NA 1085 13216 5442 NA
#> 9 University of California-Merced 445188 NA NA NA 1016 8595 4909 NA
Some fancy-pants private universities
unitid == 123961
unitid == 243744
unitid == 190150
unitid == 196468
unitid == 193900
unitid == 166027
unitid == 221999
unitid == 215062
unitid == 147767
unitid == 162928
Graduate students
books_supplies, roomboard_off, oth_expense_off
roomboard_off and oth_expense_off
# In-state
df_ipeds_pop %>%
  # graduate, in-state charges and cost of attendance
  select(instnm, unitid, tuit_grad_res, fee_grad_res, tuitfee_grad_res,
         books_supplies, roomboard_off, oth_expense_off, coa_grad_res) %>%
  # keep private fancy pants
  filter(unitid %in% c(123961, 243744, 190150, 196468, 193900,
                       166027, 221999, 215062, 147767, 162928))
#> # A tibble: 6 x 9
#> instnm unitid tuit_grad_res fee_grad_res tuitfee_grad_res books_supplies roomboard_off oth_expense_off coa_grad_res
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of Southern California 123961 46272 835 47107 1200 15912 2152 66371
#> 2 Northwestern University 147767 56067 500 56567 1638 17019 3306 78530
#> 3 Johns Hopkins University 162928 55350 2206 57556 1250 12235 1118 72159
#> 4 Columbia University in the City of New York 190150 47600 2368 49968 1294 17955 5220 74437
#> 5 New York University 193900 33408 2154 35562 752 18684 2758 57756
#> 6 University of Pennsylvania 215062 36254 3928 40182 1358 14406 1946 57892
# Out-of-state
df_ipeds_pop %>%
# keep private fancy pants
filter(unitid %in% c(123961,243744,190150,196468,193900,166027,221999,215062,147767,162928)) %>%
select(instnm,unitid,tuit_grad_nres,fee_grad_nres,tuitfee_grad_nres,books_supplies,roomboard_off,oth_expense_off,coa_grad_nres)#> # A tibble: 6 x 9
#> instnm unitid tuit_grad_nres fee_grad_nres tuitfee_grad_nres books_supplies roomboard_off oth_expense_off coa_grad_nres
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of Southern California 123961 46272 835 47107 1200 15912 2152 66371
#> 2 Northwestern University 147767 56067 500 56567 1638 17019 3306 78530
#> 3 Johns Hopkins University 162928 55350 2206 57556 1250 12235 1118 72159
#> 4 Columbia University in the City of New York 190150 47600 2368 49968 1294 17955 5220 74437
#> 5 New York University 193900 33408 2154 35562 752 18684 2758 57756
#> 6 University of Pennsylvania 215062 36254 3928 40182 1358 14406 1946 57892
MD students
df_ipeds_pop %>%
# keep private fancy pants
filter(unitid %in% c(123961,243744,190150,196468,193900,166027,221999,215062,147767,162928)) %>%
select(instnm,unitid,tuit_md_res,fee_md_res,tuitfee_md_res,books_supplies,roomboard_off,oth_expense_off,coa_md_res)#> # A tibble: 6 x 9
#> instnm unitid tuit_md_res fee_md_res tuitfee_md_res books_supplies roomboard_off oth_expense_off coa_md_res
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of Southern California 123961 64538 3065 67603 1200 15912 2152 86867
#> 2 Northwestern University 147767 62088 828 62916 1638 17019 3306 84879
#> 3 Johns Hopkins University 162928 54900 4909 59809 1250 12235 1118 74412
#> 4 Columbia University in the City of New York 190150 62980 2340 65320 1294 17955 5220 89789
#> 5 New York University 193900 55645 3495 59140 752 18684 2758 81334
#> 6 University of Pennsylvania 215062 59910 5378 65288 1358 14406 1946 82998
Law students
df_ipeds_pop %>%
# keep private fancy pants
filter(unitid %in% c(123961,243744,190150,196468,193900,166027,221999,215062,147767,162928)) %>%
select(instnm,unitid,tuit_law_res,fee_law_res,tuitfee_law_res,books_supplies,roomboard_off,oth_expense_off,coa_law_res)#> # A tibble: 6 x 9
#> instnm unitid tuit_law_res fee_law_res tuitfee_law_res books_supplies roomboard_off oth_expense_off coa_law_res
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 University of Southern California 123961 66306 885 67191 1200 15912 2152 86455
#> 2 Northwestern University 147767 66506 300 66806 1638 17019 3306 88769
#> 3 Johns Hopkins University 162928 NA NA NA 1250 12235 1118 NA
#> 4 Columbia University in the City of New York 190150 69896 2464 72360 1294 17955 5220 96829
#> 5 New York University 193900 66036 2676 68712 752 18684 2758 90906
#> 6 University of Pennsylvania 215062 63610 4996 68606 1358 14406 1946 86316
Number of institutions by type
# Number of institutions
df_ipeds_pop %>% count()#> # A tibble: 1 x 1
#> n
#> <int>
#> 1 991
# Number of institutions by Carnegie type
df_ipeds_pop %>% count(unclass(c15basic)) %>% as_factor()#> # A tibble: 6 x 2
#> `unclass(c15basic)` n
#> <dbl> <int>
#> 1 15 102
#> 2 16 98
#> 3 17 95
#> 4 18 374
#> 5 19 199
#> 6 20 123
# Number of institutions by public/private
df_ipeds_pop %>% count(unclass(control)) %>% as_factor()#> # A tibble: 3 x 2
#> `unclass(control)` n
#> <dbl> <int>
#> 1 1 459
#> 2 2 487
#> 3 3 45
# number of institutions by public/private and carnegie type
df_ipeds_pop %>% count(unclass(control),unclass(c15basic)) %>% as_factor()#> # A tibble: 16 x 3
#> `unclass(control)` `unclass(c15basic)` n
#> <dbl> <dbl> <int>
#> 1 1 15 79
#> 2 1 16 74
#> 3 1 17 38
#> 4 1 18 159
#> 5 1 19 68
#> 6 1 20 41
#> 7 2 15 23
#> 8 2 16 24
#> 9 2 17 50
#> 10 2 18 193
#> 11 2 19 121
#> 12 2 20 76
#> 13 3 17 7
#> 14 3 18 22
#> 15 3 19 10
#> 16 3 20 6
# number of institutions by level of urbanization
df_ipeds_pop %>% count(unclass(locale)) %>% as_factor()#> # A tibble: 12 x 2
#> `unclass(locale)` n
#> <dbl> <int>
#> 1 11 252
#> 2 12 142
#> 3 13 147
#> 4 21 195
#> 5 22 25
#> 6 23 27
#> 7 31 25
#> 8 32 84
#> 9 33 65
#> 10 41 17
#> 11 42 8
#> 12 43 4
# number of institutions by public/private and level of urbanization
df_ipeds_pop %>% count(unclass(control),unclass(locale)) %>% as_factor() %>% head(10)#> # A tibble: 10 x 3
#> `unclass(control)` `unclass(locale)` n
#> <dbl> <dbl> <int>
#> 1 1 11 95
#> 2 1 12 67
#> 3 1 13 78
#> 4 1 21 57
#> 5 1 22 17
#> 6 1 23 16
#> 7 1 31 13
#> 8 1 32 51
#> 9 1 33 54
#> 10 1 41 6
Tuition+fees by public/private and Carnegie type
df_ipeds_pop %>% group_by(unclass(control),unclass(c15basic)) %>%
summarize(
sample_size = n(),
n_nonmiss_tuitfee_res = sum(!is.na(tuitfee_grad_res)),
mean_tuitfee_res = mean(tuitfee_grad_res, na.rm = TRUE),
n_nonmiss_tuitfee_nres = sum(!is.na(tuitfee_grad_nres)),
mean_tuitfee_nres = mean(tuitfee_grad_nres, na.rm = TRUE),
) %>% as_factor()#> # A tibble: 16 x 7
#> # Groups: unclass(control) [3]
#> `unclass(control)` `unclass(c15basic)` sample_size n_nonmiss_tuitfee_res mean_tuitfee_res n_nonmiss_tuitfee_nres mean_tuitfee_nres
#> <dbl> <dbl> <int> <int> <dbl> <int> <dbl>
#> 1 1 15 79 79 13635. 79 28419.
#> 2 1 16 74 74 11010. 74 22425.
#> 3 1 17 38 38 9735. 38 19672.
#> 4 1 18 159 159 10241. 159 18280.
#> 5 1 19 68 68 8909. 68 16047.
#> 6 1 20 41 41 9567. 41 17019.
#> 7 2 15 23 23 45740. 23 45740.
#> 8 2 16 24 24 30842. 24 30842.
#> 9 2 17 50 50 20525. 50 20525.
#> 10 2 18 193 193 16842. 193 16842.
#> 11 2 19 121 121 14498. 121 14498.
#> 12 2 20 76 76 14159. 76 14159.
#> 13 3 17 7 7 13066. 7 13066.
#> 14 3 18 22 22 14110. 22 14110.
#> 15 3 19 10 10 16398. 10 16398.
#> 16 3 20 6 6 19569. 6 19933.
Cost of attendance by public/private and Carnegie type
df_ipeds_pop %>% group_by(unclass(control),unclass(c15basic)) %>%
summarize(
sample_size = n(),
n_nonmiss_coa_res = sum(!is.na(coa_grad_res)),
mean_coa_res = mean(coa_grad_res, na.rm = TRUE),
n_nonmiss_coa_nres = sum(!is.na(coa_grad_nres)),
mean_coa_nres = mean(coa_grad_nres, na.rm = TRUE),
) %>% as_factor()#> # A tibble: 16 x 7
#> # Groups: unclass(control) [3]
#> `unclass(control)` `unclass(c15basic)` sample_size n_nonmiss_coa_res mean_coa_res n_nonmiss_coa_nres mean_coa_nres
#> <dbl> <dbl> <int> <int> <dbl> <int> <dbl>
#> 1 1 15 79 79 30388. 79 45173.
#> 2 1 16 74 74 27004. 74 38419.
#> 3 1 17 38 38 25972. 38 35909.
#> 4 1 18 159 159 26449. 159 34487.
#> 5 1 19 68 68 24017. 68 31155.
#> 6 1 20 41 41 24417. 41 31870.
#> 7 2 15 23 23 64642. 23 64642.
#> 8 2 16 24 24 48278. 24 48278.
#> 9 2 17 50 50 37475. 50 37475.
#> 10 2 18 193 193 32363. 193 32363.
#> 11 2 19 121 121 29331. 121 29331.
#> 12 2 20 76 76 28431. 76 28431.
#> 13 3 17 7 7 25133 7 25133
#> 14 3 18 22 22 28743. 22 28743.
#> 15 3 19 10 10 32577. 10 32577.
#> 16 3 20 6 6 35184. 6 35548.
Definitions
In probability theory and statistics, a probability distribution is the mathematical function that gives the probabilities of occurrence of different possible outcomes
A little easier to get your head around “frequency distribution” and “relative frequency distribution”
We can show frequency distributions (or relative frequency distributions) as a table or as a graph
tuitfee_grad_res (for first few values)df_ipeds_pop %>% count(tuitfee_grad_res) %>% head(10)#> # A tibble: 10 x 2
#> tuitfee_grad_res n
#> <dbl> <int>
#> 1 945 1
#> 2 2320 1
#> 3 3552 3
#> 4 3620 1
#> 5 3710 1
#> 6 3915 1
#> 7 4140 1
#> 8 4146 1
#> 9 4172 1
#> 10 4356 1
We can also visualize the distribution of the variable tuitfee_grad_res.
Below, we use a function named plot_distribution(), which we will call repeatedly to plot variables. Our plot_distribution() function calls functions from the ggplot2 library, which is part of the tidyverse
Call the plot_distribution() function to plot the variable tuitfee_grad_res from the data frame df_ipeds_pop. Essentially, the plot_distribution() function creates the following things:
geom_density() function – is a “smoothed version of the histogram. This is a useful alternative to the histogram for continuous data that comes from an underlying smooth distribution”plot_distribution(df_ipeds_pop, 'tuitfee_grad_res')The distributions of variables fall into a few general categories
In order to explain these concepts, we will generate a normally distributed variable, a right-skewed variable, and a left-skewed variable
df_generated_popWe can refer to variables in the data frame df_generated_pop using the following syntax:
data_frame_name$variable_namedf_generated_pop$norm_distExamine the variable norm_dist in the data frame df_generated_pop
length(df_generated_pop$norm_dist) # length() = number of "elements" = number of observations#> [1] 10000
#mean
mean(df_generated_pop$norm_dist, na.rm = TRUE)#> [1] 49.98631
Normal distribution
We generated a variable df_generated_pop$norm_dist that has a normal distribution and then plot the variable to visualize what a normal distribution looks like
df_generated_pop$norm_dist
We can also visualize the variable df_generated_pop$norm_dist, as shown below. Note the following:
plot_distribution(df_generated_pop, 'norm_dist')Skewed distribution
Right skewed distributions
We generated the right-skewed variable df_generated_pop$rskew_dist
df_generated_pop$rskew_dist, as shown below. Note the following:
plot_distribution(df_generated_pop, 'rskew_dist')Left skewed distributions
We generated the left-skewed variable df_generated_pop$lskew_dist
df_generated_pop$lskew_dist, as shown below. Note the following:
Create and plot left-skewed variable
plot_distribution(df_generated_pop, 'lskew_dist')Having plotted generated variables that have normal, right skewed, and left skewed distributions, respectively, let’s plot the real-life variable tuitfee_grad_res from the data frame df_ipeds_pop (below). How would you diagnose the distribution of tuitfee_grad_res?
plot_distribution(df_ipeds_pop, 'tuitfee_grad_res')How would you diagnose the distribution of tuitfee_grad_res?
The normal distribution – symmetric, bell-shaped, mean equal to the median - has very useful properties that make it the basis of inferential statistics
plot_distribution(df_generated_pop, 'norm_dist')Recall our primary measure of dispersion, standard deviation, which measures how far away individual observations tend to be from the mean.
The empirical rule states that if a variable has an approximately normal distribution (i.e., approximately “bell shaped”) then:
Why is the empirical rule so important for inferential statistics?
norm_dist has a mean of about 50 and a standard deviation of about 5, so the value of 40 would be about two standard deviations below the mean. The empirical rule tells us that only about 2.5% of observations would have a value less than 40.
The “z-score” of an observation is the number of standard deviations away from the mean
z-score formula
Calculating z-score for the variable norm_dist from data frame df_generated_pop
# components of z-score
mean(df_generated_pop$norm_dist, na.rm = TRUE)#> [1] 49.98631
sd(df_generated_pop$norm_dist, na.rm = TRUE)#> [1] 4.991961
#create new variable z_norm_dist
df_generated_pop <- df_generated_pop %>% mutate(
z_norm_dist = (norm_dist - mean(norm_dist, na.rm = TRUE))/sd(norm_dist, na.rm = TRUE)
)
#list a few observations
df_generated_pop %>% select(norm_dist,z_norm_dist) %>% head(10)#> # A tibble: 10 x 2
#> norm_dist z_norm_dist
#> <dbl> <dbl>
#> 1 43.1 -1.38
#> 2 50.2 0.0411
#> 3 46.2 -0.762
#> 4 51.1 0.215
#> 5 57.1 1.43
#> 6 53.7 0.748
#> 7 53.5 0.704
#> 8 48.9 -0.227
#> 9 51.0 0.200
#> 10 56.0 1.21
# mean of z-score variable
round(mean(df_generated_pop$z_norm_dist, na.rm = TRUE), digits = 4)#> [1] 0
Plot the new z-score variable, which has:
01plot_distribution(df_generated_pop, 'z_norm_dist')Recall empirical rule for normally distributed variables
Show picture from old “chicken scratch” pdf conveying ideas about z-score and empirical rule
Delete variable z_norm_dist
df_generated_pop$z_norm_dist <- NULLDefinition:
0 and a standard deviation of 1Above, we created a variable stdnorm_dist in the data frame df_generated_pop that has a standard normal distribution. Let’s investigate and plot this variable:
mean(df_generated_pop$stdnorm_dist, na.rm = TRUE)#> [1] -0.002737966
sd(df_generated_pop$stdnorm_dist, na.rm = TRUE)#> [1] 0.9983922
plot_distribution(df_generated_pop, 'stdnorm_dist')Commentary about standard normal distribution:
Question: If we have a variable with a roughly normal distribution (i.e., symmetrical), how could we transform it into a variable with a standard normal distribution (i.e., symmetrical with mean=0 and std deviation=1)?
# create variable that has a standard normal distribution from a variable that has a normal distribution
df_generated_pop %>% mutate(
z_norm_dist = (norm_dist - mean(norm_dist, na.rm = TRUE))/sd(norm_dist, na.rm = TRUE)
) %>% head(10)#> # A tibble: 10 x 5
#> norm_dist rskew_dist lskew_dist stdnorm_dist z_norm_dist
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 43.1 0.0827 0.917 -1.39 -1.38
#> 2 50.2 0.294 0.706 0.0383 0.0411
#> 3 46.2 0.156 0.844 -0.763 -0.762
#> 4 51.1 0.331 0.669 0.212 0.215
#> 5 57.1 0.651 0.349 1.43 1.43
#> 6 53.7 0.460 0.540 0.744 0.748
#> 7 53.5 0.448 0.552 0.700 0.704
#> 8 48.9 0.242 0.758 -0.229 -0.227
#> 9 51.0 0.327 0.673 0.197 0.200
#> 10 56.0 0.588 0.412 1.21 1.21
The “sampling distribution” is the fundamental concept of inferential statistics
Briefly, recall the goal of inferential statistics:
Usually, we collect a single sample from the population. How do we know if the sample we collected is representative of the underlying population we want to make statements about?
For example, for our variable norm_dist from the data frame df_generated_pop, we randomly draw 30 observations from a population of 10,000 observations
set.seed(321)
norm_dist_s1 <- sample(x = df_generated_pop$norm_dist, size = 30)
mean(df_generated_pop$norm_dist)#> [1] 49.98631
mean(norm_dist_s1) # mean of our sample#> [1] 50.19565
But, what if we had obtained a different random sample?
set.seed(123)
norm_dist_s1 <- sample(x = df_generated_pop$norm_dist, size = 30)
mean(df_generated_pop$norm_dist)#> [1] 49.98631
mean(norm_dist_s1) # mean of our sample#> [1] 49.22062
# remove object norm_dist_s1
rm(norm_dist_s1)So we can see that the sample mean, \(\bar{x}\), changes from sample to sample
Imagine if we take 1,000 random samples of size n (e.g., 30) from a population
Sampling distribution (of the sample mean)
Excellent website for understanding how the sampling distribution works
The sampling distribution of any statistic
Plotting a single random sample and a sampling distribution
df_generated_pop and df_ipeds_pop contain all observations in the populationPlot the variable norm_dist from the sample dataset
plot_distribution(df_generated_sample, 'norm_dist')Plot the variable tuitfee_grad_res from the sample IPEDS dataset
plot_distribution(df_ipeds_sample, 'tuitfee_grad_res')Plot the sampling distribution of the sample mean for the normally distributed variable norm_dist
plot_distribution(df_generated_pop, 'norm_dist', sampling_dist = T)Plot the sampling distribution of the sample mean for the right-skewed variable rskew_dist
plot_distribution(df_generated_pop, 'rskew_dist', sampling_dist = T)Create visualization that stacks three plots on top of one another: 1= population distribution; 2 = distribution of a single random sample; 3 = sampling distribution
norm_distplot_distribution(df_generated_pop, 'norm_dist', plot_title = 'Population distribution') +
plot_distribution(df_generated_sample, 'norm_dist', plot_title = 'Single sample distribution') +
plot_distribution(df_generated_pop, 'norm_dist', sampling_dist = T,
plot_title = 'Sampling distribution') +
plot_layout(ncol = 1)Create visualization that stacks three plots on top of one another: 1= population distribution; 2 = distribution of a single random sample; 3 = sampling distribution
rskew_distplot_distribution(df_generated_pop, 'rskew_dist', plot_title = 'Population distribution') +
plot_distribution(df_generated_sample, 'rskew_dist', plot_title = 'Single sample distribution') +
plot_distribution(df_generated_pop, 'rskew_dist', sampling_dist = T,
plot_title = 'Sampling distribution') +
plot_layout(ncol = 1)Create visualization that stacks three plots on top of one another: 1= population distribution; 2 = distribution of a single random sample; 3 = sampling distribution
tuitfee_grad_res from IPEDSplot_distribution(df_ipeds_pop, 'tuitfee_grad_res', plot_title = 'Population distribution') +
plot_distribution(df_ipeds_sample, 'tuitfee_grad_res', plot_title = 'Single sample distribution') +
plot_distribution(df_ipeds_pop, 'tuitfee_grad_res', sampling_dist = T,
plot_title = 'Sampling distribution') +
plot_layout(ncol = 1)Central limit theorem
What counts as a “large” sample size?
n=30 or moreWhy is central limit theorem important?
Show central limit theorem using interactive simulation
Show central limit theorem using a very skewed variable: non-resident, grad school tuition and fees (tuitfee_grad_nres)
plot_distribution(df_ipeds_pop, 'tuitfee_grad_nres', plot_title = 'Population distribution') +
plot_distribution(df_ipeds_sample, 'tuitfee_grad_nres', plot_title = 'Single sample distribution') +
plot_distribution(df_ipeds_pop, 'tuitfee_grad_nres', sampling_dist = T,
plot_title = 'Sampling distribution') +
plot_layout(ncol = 1)Sample standard deviation, of some variable \(Y\)
Sample standard error, of the sample mean, \(\bar{Y}\)
Why is standard error so important?
Do we want standard error to be large or small? Why?
How to make standard error smaller?
Quantitative research in social sciences often proceeds as follows:
Some examples of research questions co-authors and I have answered over the years:
For each of these journal articles, we answered the research question by developing a “testable hypothesis” and testing that hypothesis using some statistical test
Developing testable hypotheses is central to univariate statistical analysis (one variable), bivariate statistical analysis (two variables), and multivariate statistical analysis (3+ variables, usually a regression model)
Example hypotheses for univariate, bivariate, multivariate statistical analyses
Why learn how to do hypothesis testing about a single population mean when this class is supposed to be about regression (and hypothesis tests about regression models)?
These are the general steps in hypothesis testing:
In real research projects, do researchers always follow these exact steps? In this exact order?
Example we will use to introduce steps in hypothesis testing
How we will teach you the steps in hypothesis testing in this lecture
This section presents a more formal introduction to hypotheses, focusing on univariate statistical analyses rather than bivariate or multivariate
Recall that the goal of inferential statistics is to make statements about a population of interest based on data from a representative sample from the population.
Definition
In univariate statistical analyses, we make a hypothesis about one population parameter (e.g., population mean \(\mu_Y\)) from one population of interest (e.g., all “research” universities and “master’s” universities, as defined by the Carnegie Classification)
df_ipeds_pop %>% head()#> # A tibble: 6 x 38
#> instnm unitid opeid6 opeid control c15basic stabbr city zip locale region tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres tuit_md_res fee_md_res tuit_md_nres fee_md_nres tuit_law_res fee_law_res tuit_law_nres fee_law_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres tuitfee_md_res tuitfee_md_nres tuitfee_law_res tuitfee_law_nres coa_grad_res coa_grad_nres coa_md_res coa_md_nres coa_law_res coa_law_nres
#> <chr> <dbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <chr+lbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Alabama A & M University 100654 001002 00100200 1 [Public] 18 [Master^s Colleges & Universities: Larger Programs] AL [Alabama] Normal 35762 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10128 1414 20160 1414 NA NA NA NA NA NA NA NA 1600 9240 3090 11542 21574 NA NA NA NA 25472 35504 NA NA NA NA
#> 2 University of Alabama at Birmingham 100663 001052 00105200 1 [Public] 15 [Doctoral Universities: Highest Research Activity] AL [Alabama] Birmingham 35294-0110 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 8100 0 19188 0 28978 0 62714 0 NA NA NA NA 1200 12307 5555 8100 19188 28978 62714 NA NA 27162 38250 48040 81776 NA NA
#> 3 Amridge University 100690 025034 02503400 2 [Private not-for-profit] 20 [Master^s Colleges & Universities: Small Programs] AL [Alabama] Montgomery 36117-3553 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 11700 1300 11700 1300 NA NA NA NA NA NA NA NA 900 9600 1600 13000 13000 NA NA NA NA 25100 25100 NA NA NA NA
#> 4 University of Alabama in Huntsville 100706 001055 00105500 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Huntsville 35899 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10632 826 24430 826 NA NA NA NA NA NA NA NA 2120 10400 3994 11458 25256 NA NA NA NA 27972 41770 NA NA NA NA
#> 5 Alabama State University 100724 001005 00100500 1 [Public] 19 [Master^s Colleges & Universities: Medium Programs] AL [Alabama] Montgomery 36104-0271 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 7416 2740 14832 2740 NA NA NA NA NA NA NA NA 1600 7320 4228 10156 17572 NA NA NA NA 23304 30720 NA NA NA NA
#> 6 The University of Alabama 100751 001051 00105100 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Tuscaloosa 35487-0100 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10780 0 30250 0 28978 0 62714 0 23610 0 43060 0 1000 13636 4600 10780 30250 28978 62714 23610 43060 30016 49486 48214 81950 42846 62296
mean(df_ipeds_pop$coa_grad_res, na.rm = TRUE)#> [1] 30318.74
When developing a hypothesis for quantitative research, we always specify a null hypothesis (\(H_0\)) AND an alternative hypothesis (\(H_a\))
Null hypothesis (\(H_0\))
Alternative hypothesis (\(H_a\))
Two-sided alternative hypothesis
One-sided alternative hypothesis (mean is greater than \(\$28,000\))
One-sided alternative hypothesis (mean is less than \(\$28,000\))
ISSUE - EQUATIONS NOT FORMATTING CORRECTLY, EVEN WHEN I CHANGE INDENTATION ASSOCIATED WITH BULLETS
Example of null and alternative hypotheses for bivariate statistical analysis
Research question:
Null and alternative hypotheses
In real research projects, we are not usually testing a hypothesis about a single population mean (univariate analysis). Rather, we are usually comparing population means of two different groups (bivariate analysis) or we are examining the relationship between an independent variable and the dependent variable after controlling for other variables (multivariate regression analysis)
Prior to conducting analyses, we usually have an expectation/suspicion about the result
Should we specify two-sided or one-sided alternative hypothesis, \(H_a\)?
Because EDUC152 is a course about regression rather than univariate/bivariate statistics, we will always specify two-sided alternative hypotheses and ignore one-sided alternative hypotheses from here on out
Restate null, \(H_0\), and alternative (two-sided), \(H_a\), hypothesis for our practical example
We must conduct a formal statistical test to decide whether we should reject the null hypothesis
Logic of the test statistic
General formula for test statistic (for pretty much any kind of hypothesis test):
\[ test\_statistic = \frac{sample\_estimate - value\_associated\_with\_H_0}{sample\_standard\_error}\] Formula for test statistic about a single population mean
Calculating t-test statistic for our practical example
\(H_0: \mu_Y = \mu_{Y0} = \$28,000\) ; \(H_a: \mu_Y \ne \$28,000\)
Calculate components of t-test (using functions and by hand)
# sample size
length(df_ipeds_sample$coa_grad_res) # assuming no missing observations#> [1] 200
df_ipeds_sample %>% summarize(n_non_miss = sum(!(is.na(coa_grad_res)))) # count only number of non-missing#> # A tibble: 1 x 1
#> n_non_miss
#> <int>
#> 1 200
# sample mean of coa_grad_res
mean(df_ipeds_sample$coa_grad_res, na.rm = TRUE) # using function#> [1] 30002.74
# sample standard deviation of coa_grad_res
sd(df_ipeds_sample$coa_grad_res, na.rm = TRUE)#> [1] 11457.01
# sample standard error of sample mean of coa_grad_res = std_dev/sqrt(n)
sd(df_ipeds_sample$coa_grad_res, na.rm = TRUE)/sqrt(length(df_ipeds_sample$coa_grad_res))#> [1] 810.1332
Components of t-test:
Calculating t-test
\[t = \frac{\bar{Y} - \mu_{Y0}}{\hat{\sigma}_{\bar{Y}}} = \frac{30002.74 - 28000}{810.1332} = 2.4721\]
t.test() function
t.test() does
t.test(x, y = NULL, alternative = c("two.sided", "less", "greater"), mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95, ...)x: vector (variable) you want to calculate t-test foralternative: whether you want two-sided or one-sided alternative hypothesis (default is two.sided)mu: value associated with null hypothesis (default is 0)Calculating t-test value (using function and by hand)
# t-statistic = (sample_mean - mu_H_0)/(sample std err)
# using function
#?t.test # to see help file for function
t.test(x = df_ipeds_sample$coa_grad_res, mu = 28000)#>
#> One Sample t-test
#>
#> data: df_ipeds_sample$coa_grad_res
#> t = 2.4721, df = 199, p-value = 0.01427
#> alternative hypothesis: true mean is not equal to 28000
#> 95 percent confidence interval:
#> 28405.20 31600.29
#> sample estimates:
#> mean of x
#> 30002.74
# by hand
(mean(df_ipeds_sample$coa_grad_res, na.rm = TRUE) - 28000)/(sd(df_ipeds_sample$coa_grad_res, na.rm = TRUE)/sqrt(length(df_ipeds_sample$coa_grad_res)))#> [1] 2.472118
The test statistic refers to a sampling distribution not the distribution of your single sample
Recall core ideas of sampling distribution (we will refer to sampling distribution of sample mean, \(\bar{Y}\))
Here we visually stack the following for the variable coa_grad_res:
plot_distribution(df_ipeds_pop, 'coa_grad_res', plot_title = 'Population distribution') +
plot_distribution(df_ipeds_sample, 'coa_grad_res', plot_title = 'Single sample distribution') +
plot_distribution(df_ipeds_pop, 'coa_grad_res', sampling_dist = T,
plot_title = 'Sampling distribution') +
plot_layout(ncol = 1)Usually we cannot know the sampling distribution because we do not have data on the entire population; we only have data on our single random sample
However, hypothesis testing is not based on the true sampling distribution of the sample mean. It is based on the sampling distribution under the assumption that the null hypothesis is correct
Here we visually stack the following for the variable coa_grad_res:
plot_distribution(df_ipeds_pop, 'coa_grad_res', plot_title = 'Population distribution') +
plot_distribution(df_ipeds_sample, 'coa_grad_res', plot_title = 'Single sample distribution') +
plot_t_distribution(df_ipeds_sample, 'coa_grad_res', mu = 28000,shade_rejection = F, shade_pval = T,
plot_title = 'Sampling distribution, assuming H_0', stacked = T) +
plot_layout(ncol = 1)The t-test statistic is the distance between the hypothesized \(H_0\) value and the observed sample estimate value \(\bar{Y}\) scaled in terms of standard errors
2 or a t-value less than -2 because we know (from empirical rule and central limit theorem) that 95% of observations fall within two standard deviations of the mean for a normally distributed variable“p-value” refers to the probability-value associated with the t-value from your test statistic
Definition
A small p-value means that it would be unusual to find the sample estimate we observed if the null hypothesis \(H_0\) were true.
Calculating p-value For a two-sided alternative hypothesis (\(H_a: \mu_Y \ne \mu_{Y0}\))
Let’s calculate and visualize p-value for a couple different hypothesized values of the population mean \(\mu_{Y0}\) for the variable coa_grad_res (full-time, resident grad school cost of attendance) from the data frame df_ipeds_sample
\(H_0: \mu_Y = \mu_{Y0} = \$29,000\) and \(H_a: \mu_Y \ne \$29,000\)
mean(x = df_ipeds_sample$coa_grad_res)#> [1] 30002.74
t.test(x = df_ipeds_sample$coa_grad_res, mu = 29000)#>
#> One Sample t-test
#>
#> data: df_ipeds_sample$coa_grad_res
#> t = 1.2378, df = 199, p-value = 0.2173
#> alternative hypothesis: true mean is not equal to 29000
#> 95 percent confidence interval:
#> 28405.20 31600.29
#> sample estimates:
#> mean of x
#> 30002.74
plot_t_distribution(df_ipeds_sample, 'coa_grad_res', mu = 29000,shade_rejection = F, shade_pval = T)\(H_0: \mu_Y = \mu_{Y0} = \$28,000\) and \(H_a: \mu_Y \ne \$28,000\)
mean(x = df_ipeds_sample$coa_grad_res)#> [1] 30002.74
t.test(x = df_ipeds_sample$coa_grad_res, mu = 28000)#>
#> One Sample t-test
#>
#> data: df_ipeds_sample$coa_grad_res
#> t = 2.4721, df = 199, p-value = 0.01427
#> alternative hypothesis: true mean is not equal to 28000
#> 95 percent confidence interval:
#> 28405.20 31600.29
#> sample estimates:
#> mean of x
#> 30002.74
plot_t_distribution(df_ipeds_sample, 'coa_grad_res', mu = 28000,shade_rejection = F, shade_pval = T)\(H_0: \mu_Y = \mu_{Y0} = \$31,500\) and \(H_a: \mu_Y \ne \$31,500\)
mean(x = df_ipeds_sample$coa_grad_res)#> [1] 30002.74
t.test(x = df_ipeds_sample$coa_grad_res, mu = 31500)#>
#> One Sample t-test
#>
#> data: df_ipeds_sample$coa_grad_res
#> t = -1.8482, df = 199, p-value = 0.06606
#> alternative hypothesis: true mean is not equal to 31500
#> 95 percent confidence interval:
#> 28405.20 31600.29
#> sample estimates:
#> mean of x
#> 30002.74
plot_t_distribution(df_ipeds_sample, 'coa_grad_res', mu = 31500)\(\alpha\) level (referred to as “alpha level” or “rejection region”)
In practice, the most common alpha level \(\alpha\) is .05
.10 but usually this is viewed as not sufficiently strong threshold to reject \(H_0\)Usually, you define alpha level prior to running analyses
To show how alpha level is used in practice, we’ll test the null hypothesis that population mean grad resident cost of attendance is 28,500, initially using an alpha level of .05
Note that the t.test() function doesn’t have an argument that lets you specify the alpha level (rejection region); rather, the idea is that you choose the alpha level and then compare that to the p-value calculated by t.test()
t.test(x = df_ipeds_sample$coa_grad_res, mu = 28500)#>
#> One Sample t-test
#>
#> data: df_ipeds_sample$coa_grad_res
#> t = 1.8549, df = 199, p-value = 0.06508
#> alternative hypothesis: true mean is not equal to 28500
#> 95 percent confidence interval:
#> 28405.20 31600.29
#> sample estimates:
#> mean of x
#> 30002.74
Note that the user-defined plot_t_distribution() includes an optional argument that allows you to specify the alpha-level
plot_t_distributionfunction(data_df, data_var, group_var = NULL, group_cat = NULL, mu = 0, alpha = 0.05, alternative = 'two.sided', plot_title = '', shade_rejection = T, shade_pval = T, stacked = F)alpha, which has the default value of 0.05Below, we run plot_t_distribution(), manually setting the alpha argument to the default value of 0.05
alpha = .05, the shaded region represents .05, that is the percent of all observations that lie in the rejection regionplot_t_distribution(df_ipeds_sample, 'coa_grad_res', mu = 28500, alpha = .05)How would our conclusion change if we chose an alpha level of .10?
plot_t_distribution(df_ipeds_sample, 'coa_grad_res', mu = 28500, alpha = .10)Last step is to make a conclusion about your hypothesis based on comparing the p-value you observed to the alpha level
How to state the conclusion
This example:
28,500plot_t_distribution(df_ipeds_sample, 'coa_grad_res', mu = 28500, alpha = .05,
shade_rejection = TRUE, shade_pval = FALSE)All statistical tests (based on some statistical analysis) depend on “assumptions”
Assumptions necessary for testing hypotheses about a population mean
“Robust”
Hypothesis tests about a population mean are “robust” to the normal distribution assumption
Central limit theorem:
Hypothesis tests about population means are not robust to violations of random sampling
Research question:
tuitfee_grad_nres]Let’s imagine we want to test whether the population mean, \(\mu_Y = \$17,000\), using a two-sided alternative hypothesis and an alpha level of .05
State null and alternative hypotheses
Test statistic and p-value
Components of t-test
Calculating t-test p-value using t.test()
t.test(df_ipeds_sample$tuitfee_grad_nres, mu = 17000)#>
#> One Sample t-test
#>
#> data: df_ipeds_sample$tuitfee_grad_nres
#> t = 2.2917, df = 199, p-value = 0.02297
#> alternative hypothesis: true mean is not equal to 17000
#> 95 percent confidence interval:
#> 17225.55 20007.71
#> sample estimates:
#> mean of x
#> 18616.63
plot_t_distribution(df_ipeds_sample, 'tuitfee_grad_nres', mu = 17000, alpha = .05,
shade_rejection = TRUE, shade_pval = FALSE)Conclusion
.05, we reject \(H_0\)Finally, we usually don’t have all data on the population. But since we do for IPEDS, we can plot:
plot_distribution(df_ipeds_pop, 'tuitfee_grad_nres', plot_title = 'Population distribution') +
plot_distribution(df_ipeds_sample, 'tuitfee_grad_nres', plot_title = 'Single sample distribution') +
plot_distribution(df_ipeds_pop, 'tuitfee_grad_nres', sampling_dist = T, plot_title = 'True Sampling distribution') +
plot_t_distribution(df_ipeds_sample, 'tuitfee_grad_nres', mu = 17000, plot_title = 'Sampling distribution, assuming H_0', stacked = T) +
plot_layout(ncol = 1)text
text